#include <sys/cdefs.h>
#include <cstdio>
// Translate the compiler-provided target macros into the short chip-family
// macros that the rest of this file tests.
#ifdef __sw_64_sw2__
#define __sw_thl__
#elif defined(__sw_64_sw7a__)
#define __sw_ocn__
#endif
// desc_t is the value type of a hardware transfer descriptor. Its width is
// chosen per chip family through a GCC machine-mode attribute: in GCC's mode
// naming OI is a 32-byte and XI a 64-byte integer mode, so desc_t is
// presumably 256 bits on __sw_thl__ and 512 bits on __sw_ocn__ -- TODO confirm
// against the target's mode definitions.
// NOTE(review): there is no #else branch; when neither family macro is
// defined, desc_t is never declared and every later use fails to compile.
#ifdef __sw_ocn__
typedef int desc_t __attribute__ ((__mode__(__V1XI__)));
#elif defined(__sw_thl__)
typedef int desc_t __attribute__ ((__mode__(__V1OI__)));
#endif
// File-scope object of descriptor type. Nothing visible in this file reads or
// writes it (descriptor::ireq and descriptor::syn declare their own local
// `tmp`).
// NOTE(review): if this file is included as a header from more than one
// translation unit, this non-inline definition is a multiple definition --
// confirm whether it is still needed.
desc_t tmp;
// Overlay used to assemble a desc_t out of individual int-sized words: the
// descriptor constructor fills `ival` with a designated initializer and then
// reads the result back through `desc`.
// Reading `desc` after writing `ival` is union type punning; GCC documents
// this as supported, but ISO C++ does not guarantee it.
union literal_desc{
  // As many ints as fit in one desc_t.
  int ival[sizeof(desc_t) / sizeof(int)];
  // The same bytes viewed as a single descriptor value.
  desc_t desc;
};
// Named integer selectors with the same names and values as the constants
// declared inside the descriptor template. The grouping below is presumably
// how they map onto its KIND / MODE / DIR template arguments -- confirm
// against callers.
namespace dsc {
  // Transfer kind.
  constexpr int DMA = 0;
  constexpr int RMA = 1;

  // Addressing / broadcast mode.
  constexpr int PE            = 0;
  constexpr int BCAST_ROW     = 1;
  constexpr int BCAST_COL     = 2;
  constexpr int BCAST_ROW_ALL = 5;
  constexpr int BCAST_COL_ALL = 6;

  // Transfer direction.
  constexpr int PUT = 0;
  constexpr int GET = 1;
}
template<int KIND, int MODE, int DIR>
struct descriptor{
  static constexpr int DMA = 0;
  static constexpr int RMA = 1;
  static constexpr int PE = 0;
  static constexpr int BCAST_ROW = 1;
  static constexpr int BCAST_COL = 2;
  static constexpr int BCAST_ROW_ALL = 5;
  static constexpr int BCAST_COL_ALL = 6;
  static constexpr int PUT = 0;
  static constexpr int GET = 1;

  __always_inline descriptor(int mask = 0, int stride = 0, int bsize = 0) {
    reply = 0;
    count = 0;

    static_assert(KIND == DMA || KIND == RMA, "descriptor kind must be descriptor::DMA or descriptor::RMA");
    //static_assert(MASK >= 0 && MASK <= 0xff, "mask must be in [0x00, oxff]");
    if (KIND == RMA){
      literal_desc initial_desc = literal_desc{.ival={0, MODE << 28 | DIR << 24, (int)(long)&reply | mask << 24, (int)(long)&reply}};
      desc = initial_desc.desc;
    } else if (KIND == DMA){
      literal_desc initial_desc = literal_desc{.ival={0, bsize | MODE << 28 | DIR << 24, (int)(long)&reply | mask << 24, stride}};
      desc = initial_desc.desc;
    }
  }
  template<typename TM, typename TL>
  __always_inline descriptor &ireq(TM *mem, TL *ldm, int count, int ACTIVE = 1){
    desc_t tmp;
    if (ACTIVE){
      if (KIND == DMA) {
	asm volatile("vinsw %[SIZE], %[DESC_IN], 0, %[DESC]\n\t"
		     "dma %[DESC], %[MEM], %[LDM]\n\t"
		     : [DESC]"=&r"(tmp), "+m"(reply)
		     : [MEM]"r"(mem), [LDM]"r"(ldm), [SIZE]"r"(count * sizeof(TL)), [DESC_IN]"0"(desc) : "memory");
      }
      else if (KIND == RMA) {
	asm volatile("vinsw %[SIZE], %[DESC_IN], 0, %[DESC]\n\t"
		     "rma %[DESC], %[MEM], %[LDM]\n\t"
		     : [DESC]"=&r"(tmp), "+m"(reply)
		     : [MEM]"r"(mem), [LDM]"r"(ldm), [SIZE]"r"(count * sizeof(TL)), [DESC_IN]"0"(desc) : "memory");
      }
    } else {
      asm volatile("":::"memory");
    }
    this->count ++;
    return *this;
  }

  __always_inline void syn(){
    //if (_MYID == 0) printf("%p\n", &reply);
    int tmp;
    asm volatile("1:\n\t"
		 "ldw %[TMP], %[RPL]\n\t"
		 "sextw %[TMP], %[TMP]\n\t"
		 "subw %[TMP], %[CNT], %[TMP]\n\t"
		 "bne %[TMP], 1b\n\t"
		 "memb\n\t"
		 : [RPL]"+m"(reply), [TMP]"=&r"(tmp)
		 : [CNT]"r"(count) : "memory");
  }
  // __always_inline void print(){
  //   int *t = (int*)&desc;
  //   printf("%x %x %x %x %x %x\n", t[0], t[1], t[2], t[3], t[4], t[5]);
  // }
  template<typename TM, typename TL>
  __always_inline descriptor &operator()(TM *mem, TL *ldm, int count, int ACTIVE = 1) {
    return ireq(mem, ldm, count, ACTIVE);
  }
  desc_t desc;
  volatile int reply, count;
};
